{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9a2fc3e2-9a78-4d18-b809-391b7d9a9430",
   "metadata": {},
   "source": [
    "### PhyloP max score over SV "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f3cd2ccf-692a-4ddb-918c-a4a6ac8c5e0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path \n",
    "import pyBigWig\n",
    "import numpy as np\n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "416de8ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "phylop_scores_bw = wkdir_path.joinpath(\"reference/conservation/phylop/hg38.phyloP100way.bw\")\n",
    "sv_bed_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "out_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/phylop\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30445b22",
   "metadata": {},
   "outputs": [],
   "source": [
    "bed_columns={0:\"chrom\", 1:\"start\", 2:\"end\", 3:\"vrnt_id\"}\n",
    "sv_bed_df = pd.read_csv(sv_bed_path, sep=\"\\t\", header=None).rename(columns=bed_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cd11db72",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/ipykernel_launcher.py:15: RuntimeWarning: All-NaN axis encountered\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "phylop_bw = pyBigWig.open(str(phylop_scores_bw))\n",
    "phylop_consv_vrnt_results = {}\n",
    "for index, row in sv_bed_df.iterrows():\n",
    "    chrom = row['chrom']\n",
    "    start = row['start']\n",
    "    end = row['end']\n",
    "    vrnt_id = row[\"vrnt_id\"]\n",
    "    # query BigWig file\n",
    "    values = phylop_bw.values(chrom, start, end)\n",
    "    if len(values) != end - start: \n",
    "        print(vrnt_id)\n",
    "    # ignores NaNs\n",
    "    max_value = np.nanmax(values) if values is not None else None\n",
    "    # add max to dictionary \n",
    "    phylop_consv_vrnt_results[index] = [chrom, start, end, vrnt_id, max_value]\n",
    "# close the BigWig file\n",
    "phylop_bw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6002bac4",
   "metadata": {},
   "outputs": [],
   "source": [
    "pylop_results_columns=[\"chrom\", \"start\", \"end\", \"vrnt_id\", \"phylop_max\"]\n",
    "phylop_consv_vrnt_df = pd.DataFrame.from_dict(phylop_consv_vrnt_results, orient=\"index\", columns=pylop_results_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e8717347",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of varints annotated: 20255\n",
      "Number of variants with no PhyloP score: 193\n"
     ]
    }
   ],
   "source": [
    "vrnt_id_in_phylop = phylop_consv_vrnt_df.vrnt_id.unique()\n",
    "print(f\"Number of varints annotated: {len(vrnt_id_in_phylop)}\")\n",
    "vrnt_id_in_phylop_nan = phylop_consv_vrnt_df[phylop_consv_vrnt_df.phylop_max.isna()]\n",
    "print(f\"Number of variants with no PhyloP score: {len(vrnt_id_in_phylop_nan)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d380e441",
   "metadata": {},
   "outputs": [],
   "source": [
    "phylop_consv_vrnt_path = out_dir.joinpath(\"sv_in_windows_phylop_metrics.tsv\")\n",
    "phylop_consv_vrnt_df.to_csv(phylop_consv_vrnt_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b636697b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
